In [1]:
# Standard library.
import os

# Third-party: data handling, plotting, modelling, explainability.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import shap
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor

# List every file shipped with the competition dataset.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

sns.set_context("paper")  # compact seaborn styling for inline figures
shap.initjs()             # load the JS runtime for interactive SHAP plots
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv /kaggle/input/house-prices-advanced-regression-techniques/data_description.txt /kaggle/input/house-prices-advanced-regression-techniques/train.csv /kaggle/input/house-prices-advanced-regression-techniques/test.csv
In [2]:
# Load the competition training data and report its size.
houses = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
n_rows, n_cols = houses.shape
print(f"Rows: {n_rows:,}")
print(f"Cols: {n_cols:,}")
houses.head(3)
Rows: 1,460 Cols: 81
Out[2]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 rows × 81 columns
In [3]:
RANDOM_STATE = 42
General Overview¶
In [4]:
# Bucket the column names by pandas dtype and keep one list per group for later use.
dtype_groups = houses.columns.to_series().groupby(houses.dtypes).apply(list)
int_cols, float_cols, object_cols = [], [], []
for dtype, columns in dtype_groups.items():
    # Route each dtype group into its bucket; anything non-numeric counts as "object".
    if dtype == 'int64':
        int_cols = columns
    elif dtype == 'float64':
        float_cols = columns
    else:
        object_cols = columns
    print(f"There are {len(columns):3} columns of type {dtype}:")
    print(columns)
    print("\n")
There are 35 columns of type int64: ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'] There are 3 columns of type float64: ['LotFrontage', 'MasVnrArea', 'GarageYrBlt'] There are 43 columns of type object: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
In [5]:
print(houses.dtypes)
Id int64
MSSubClass int64
MSZoning object
LotFrontage float64
LotArea int64
...
MoSold int64
YrSold int64
SaleType object
SaleCondition object
SalePrice int64
Length: 81, dtype: object
In [6]:
ID = 'Id'             # row identifier — excluded from modelling
TARGET = 'SalePrice'  # regression target
Numerical Variables¶
In [7]:
# Numerical feature candidates: every int/float column except the id and the target.
excluded = {ID, TARGET}
num_features = [col for col in int_cols + float_cols if col not in excluded]
In [8]:
# Missing-value and cardinality profile of the numerical columns.
temp = houses[num_features]
n_samples = len(temp)
missing = temp.isnull()
stats = pd.DataFrame({
    'Variable Name': temp.columns,
    'Missing Count': missing.sum().values,
    'Missing Rate (%)': 100 * missing.mean().values,
    'Unique Count': temp.nunique().values,
    'Unique Rate (%)': 100 * temp.nunique().values / n_samples,
}).sort_values('Missing Rate (%)', ascending=False)
stats
Out[8]:
| Variable Name | Missing Count | Missing Rate (%) | Unique Count | Unique Rate (%) | |
|---|---|---|---|---|---|
| 33 | LotFrontage | 259 | 17.739726 | 110 | 7.534247 |
| 35 | GarageYrBlt | 81 | 5.547945 | 97 | 6.643836 |
| 34 | MasVnrArea | 8 | 0.547945 | 327 | 22.397260 |
| 26 | EnclosedPorch | 0 | 0.000000 | 120 | 8.219178 |
| 20 | TotRmsAbvGrd | 0 | 0.000000 | 12 | 0.821918 |
| 21 | Fireplaces | 0 | 0.000000 | 4 | 0.273973 |
| 22 | GarageCars | 0 | 0.000000 | 5 | 0.342466 |
| 23 | GarageArea | 0 | 0.000000 | 441 | 30.205479 |
| 24 | WoodDeckSF | 0 | 0.000000 | 274 | 18.767123 |
| 25 | OpenPorchSF | 0 | 0.000000 | 202 | 13.835616 |
| 27 | 3SsnPorch | 0 | 0.000000 | 20 | 1.369863 |
| 1 | LotArea | 0 | 0.000000 | 1073 | 73.493151 |
| 28 | ScreenPorch | 0 | 0.000000 | 76 | 5.205479 |
| 29 | PoolArea | 0 | 0.000000 | 8 | 0.547945 |
| 30 | MiscVal | 0 | 0.000000 | 21 | 1.438356 |
| 31 | MoSold | 0 | 0.000000 | 12 | 0.821918 |
| 32 | YrSold | 0 | 0.000000 | 5 | 0.342466 |
| 19 | KitchenAbvGr | 0 | 0.000000 | 4 | 0.273973 |
| 0 | MSSubClass | 0 | 0.000000 | 15 | 1.027397 |
| 17 | HalfBath | 0 | 0.000000 | 3 | 0.205479 |
| 8 | BsmtUnfSF | 0 | 0.000000 | 780 | 53.424658 |
| 2 | OverallQual | 0 | 0.000000 | 10 | 0.684932 |
| 3 | OverallCond | 0 | 0.000000 | 9 | 0.616438 |
| 4 | YearBuilt | 0 | 0.000000 | 112 | 7.671233 |
| 5 | YearRemodAdd | 0 | 0.000000 | 61 | 4.178082 |
| 6 | BsmtFinSF1 | 0 | 0.000000 | 637 | 43.630137 |
| 7 | BsmtFinSF2 | 0 | 0.000000 | 144 | 9.863014 |
| 9 | TotalBsmtSF | 0 | 0.000000 | 721 | 49.383562 |
| 16 | FullBath | 0 | 0.000000 | 4 | 0.273973 |
| 10 | 1stFlrSF | 0 | 0.000000 | 753 | 51.575342 |
| 11 | 2ndFlrSF | 0 | 0.000000 | 417 | 28.561644 |
| 12 | LowQualFinSF | 0 | 0.000000 | 24 | 1.643836 |
| 13 | GrLivArea | 0 | 0.000000 | 861 | 58.972603 |
| 14 | BsmtFullBath | 0 | 0.000000 | 4 | 0.273973 |
| 15 | BsmtHalfBath | 0 | 0.000000 | 3 | 0.205479 |
| 18 | BedroomAbvGr | 0 | 0.000000 | 8 | 0.547945 |
In [9]:
# Raw SalePrice distribution — visibly right-skewed, which motivates the
# log1p target transform used later.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(houses[TARGET], bins=50, edgecolor='white', linewidth=1.5, color='#4682B4', alpha=0.7)
ax.set_xlabel(TARGET)
ax.set_title("Distribution of Target SalePrice", fontsize=12)
# Format the x-axis in thousands of dollars.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))
plt.show()
In [10]:
# One histogram panel per numerical feature, laid out on an 8x5 grid.
fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(15, 18))
axes = axes.flatten()
bar_color = '#4682B4'
for ax, col in zip(axes, num_features):
    ax.hist(houses[col], color=bar_color, bins=20, edgecolor='white', alpha=0.7)
    ax.set_title(f"{col}", fontsize=12)
    ax.set_ylabel("Frequency", fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.6)
# Remove the unused trailing panels.
for ax in axes[len(num_features):]:
    fig.delaxes(ax)
plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()
In [11]:
# Pairwise Pearson correlations between the numerical variables.
corr = houses[num_features].corr()
plt.figure(figsize=(25, 15))
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f')
plt.title('Data Correlation Heatmap (Numerical Variables)', fontsize=12)
plt.show()
Categorical Variables¶
In [12]:
# Same missing/cardinality profile, now for the categorical (object) columns.
temp = houses[object_cols].copy()
missing = temp.isnull()
nunique = temp.nunique()
stats = pd.DataFrame({
    'Variable Name': temp.columns,
    'Missing Count': missing.sum().values,
    'Missing Rate (%)': 100 * missing.mean().values,
    'Unique Count': nunique.values,
    'Unique Rate (%)': 100 * nunique.values / len(temp),
}).sort_values('Missing Rate (%)', ascending=False)
stats
Out[12]:
| Variable Name | Missing Count | Missing Rate (%) | Unique Count | Unique Rate (%) | |
|---|---|---|---|---|---|
| 38 | PoolQC | 1453 | 99.520548 | 3 | 0.205479 |
| 40 | MiscFeature | 1406 | 96.301370 | 4 | 0.273973 |
| 2 | Alley | 1369 | 93.767123 | 2 | 0.136986 |
| 39 | Fence | 1179 | 80.753425 | 4 | 0.273973 |
| 17 | MasVnrType | 872 | 59.726027 | 3 | 0.205479 |
| 32 | FireplaceQu | 690 | 47.260274 | 5 | 0.342466 |
| 33 | GarageType | 81 | 5.547945 | 6 | 0.410959 |
| 36 | GarageCond | 81 | 5.547945 | 5 | 0.342466 |
| 35 | GarageQual | 81 | 5.547945 | 5 | 0.342466 |
| 34 | GarageFinish | 81 | 5.547945 | 3 | 0.205479 |
| 25 | BsmtFinType2 | 38 | 2.602740 | 6 | 0.410959 |
| 23 | BsmtExposure | 38 | 2.602740 | 4 | 0.273973 |
| 24 | BsmtFinType1 | 37 | 2.534247 | 6 | 0.410959 |
| 21 | BsmtQual | 37 | 2.534247 | 4 | 0.273973 |
| 22 | BsmtCond | 37 | 2.534247 | 4 | 0.273973 |
| 29 | Electrical | 1 | 0.068493 | 5 | 0.342466 |
| 30 | KitchenQual | 0 | 0.000000 | 4 | 0.273973 |
| 28 | CentralAir | 0 | 0.000000 | 2 | 0.136986 |
| 31 | Functional | 0 | 0.000000 | 7 | 0.479452 |
| 27 | HeatingQC | 0 | 0.000000 | 5 | 0.342466 |
| 26 | Heating | 0 | 0.000000 | 6 | 0.410959 |
| 37 | PavedDrive | 0 | 0.000000 | 3 | 0.205479 |
| 41 | SaleType | 0 | 0.000000 | 9 | 0.616438 |
| 0 | MSZoning | 0 | 0.000000 | 5 | 0.342466 |
| 1 | Street | 0 | 0.000000 | 2 | 0.136986 |
| 10 | Condition2 | 0 | 0.000000 | 8 | 0.547945 |
| 3 | LotShape | 0 | 0.000000 | 4 | 0.273973 |
| 4 | LandContour | 0 | 0.000000 | 4 | 0.273973 |
| 5 | Utilities | 0 | 0.000000 | 2 | 0.136986 |
| 6 | LotConfig | 0 | 0.000000 | 5 | 0.342466 |
| 7 | LandSlope | 0 | 0.000000 | 3 | 0.205479 |
| 8 | Neighborhood | 0 | 0.000000 | 25 | 1.712329 |
| 9 | Condition1 | 0 | 0.000000 | 9 | 0.616438 |
| 11 | BldgType | 0 | 0.000000 | 5 | 0.342466 |
| 20 | Foundation | 0 | 0.000000 | 6 | 0.410959 |
| 12 | HouseStyle | 0 | 0.000000 | 8 | 0.547945 |
| 13 | RoofStyle | 0 | 0.000000 | 6 | 0.410959 |
| 14 | RoofMatl | 0 | 0.000000 | 8 | 0.547945 |
| 15 | Exterior1st | 0 | 0.000000 | 15 | 1.027397 |
| 16 | Exterior2nd | 0 | 0.000000 | 16 | 1.095890 |
| 18 | ExterQual | 0 | 0.000000 | 4 | 0.273973 |
| 19 | ExterCond | 0 | 0.000000 | 5 | 0.342466 |
| 42 | SaleCondition | 0 | 0.000000 | 6 | 0.410959 |
There are quite a few variables with missing values. We need to handle these before we apply label encoding.
In [13]:
def handle_missing_values(data, object_cols, strategy='fill_missing'):
    """Impute missing values in categorical features (mutates `data` in place).

    Args:
        data: DataFrame to impute; it is modified and also returned.
        object_cols: names of the categorical columns to process.
        strategy: 'fill_missing' inserts the sentinel category 'Missing';
            'most_frequent' fills with the column's mode.

    Returns:
        The same DataFrame with the selected columns imputed.

    Raises:
        ValueError: for an unsupported `strategy` (previously an unknown
            strategy was silently ignored, leaving NaNs in place).
    """
    if strategy not in ('fill_missing', 'most_frequent'):
        raise ValueError(f"Unknown strategy: {strategy!r}")
    for col in object_cols:
        if strategy == 'fill_missing':
            data[col] = data[col].fillna('Missing')
        else:  # 'most_frequent'
            data[col] = data[col].fillna(data[col].mode()[0])
    return data
In [14]:
houses = handle_missing_values(houses, object_cols, strategy='fill_missing')
In [15]:
# SalePrice boxplots per categorical variable, five panels per figure row.
num_cols = len(object_cols)
num_plots = 5
for start in range(0, num_cols, num_plots):
    fig, axes = plt.subplots(1, num_plots, figsize=(18, 3.5))
    for ax, col in zip(axes, object_cols[start:start + num_plots]):
        sns.boxplot(x=houses[col], y=houses[TARGET], ax=ax, linewidth=2, fliersize=3, palette='viridis')
        # Rotate category labels so long names stay readable.
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='center')
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))
        ax.set_title(f"{TARGET} by {col}", fontsize=12)
        ax.set_ylabel(TARGET, fontsize=10)
    plt.tight_layout()
    plt.show()
In [16]:
def label_encode_features(data, object_cols):
    """Fit one LabelEncoder per categorical column and write the integer
    codes into new 'le_'-prefixed columns (originals are kept untouched).

    Returns a dict with the mutated DataFrame, the list of new column
    names, and the fitted encoders (reusable on the test set).
    """
    encoders = {}
    new_columns = []
    for col in object_cols:
        encoder = LabelEncoder()
        encoded_name = f'le_{col}'
        # astype(str) guards against mixed-type content before encoding.
        data[encoded_name] = encoder.fit_transform(data[col].astype(str))
        encoders[col] = encoder
        new_columns.append(encoded_name)
    return {
        'data': data,
        'encoded_features': new_columns,
        'label_encoder': encoders,
    }
# Encode all categoricals and preview the generated 'le_' columns.
le_result = label_encode_features(houses, object_cols)
houses = le_result['data'].copy()
houses[le_result['encoded_features']].head(3)
Out[16]:
| le_MSZoning | le_Street | le_Alley | le_LotShape | le_LandContour | le_Utilities | le_LotConfig | le_LandSlope | le_Neighborhood | le_Condition1 | ... | le_GarageType | le_GarageFinish | le_GarageQual | le_GarageCond | le_PavedDrive | le_PoolQC | le_Fence | le_MiscFeature | le_SaleType | le_SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 1 | 1 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | ... | 1 | 2 | 5 | 5 | 2 | 3 | 2 | 1 | 8 | 4 |
| 1 | 3 | 1 | 1 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | ... | 1 | 2 | 5 | 5 | 2 | 3 | 2 | 1 | 8 | 4 |
| 2 | 3 | 1 | 1 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | ... | 1 | 2 | 5 | 5 | 2 | 3 | 2 | 1 | 8 | 4 |
3 rows × 43 columns
In [17]:
cat_features = le_result.get('encoded_features')
Feature Engineering¶
In [18]:
houses['GrLivArea_OverallQual'] = houses['GrLivArea'] * houses['OverallQual']
Preparation¶
In the following we are going to call the "classical" test set out-of-bag (OOB for short). The reason is simply that the "actual" test set is the one we will submit for this competition.
In [19]:
# Final model inputs: numerical plus encoded categorical, minus id/target.
features = [c for c in num_features + cat_features if c not in (ID, TARGET)]
In [20]:
X = houses[features]
# y = houses[TARGET]
# Train on log(1 + SalePrice) to tame the right skew; predictions are mapped
# back with np.expm1 before computing dollar-scale RMSE.
y = np.log1p(houses[TARGET])
X_train, X_oob, y_train, y_oob = train_test_split(X, y,
                                                  test_size=0.2,  # In our case, this is the Out-Of-Bag (OOB)
                                                  random_state=RANDOM_STATE)
In [21]:
# preprocessing_pipeline = Pipeline(steps=[
# ('imputer', SimpleImputer(strategy='mean')), # Impute missing values
# ('scaler', StandardScaler()) # Scale numeric features
# ])
# X_train_scaled = preprocessing_pipeline.fit_transform(X_train)
# X_oob_scaled = preprocessing_pipeline.transform(X_oob)
In [22]:
from sklearn.compose import ColumnTransformer
# (Pipeline, StandardScaler and SimpleImputer were already imported in the
# first cell — the redundant re-imports have been dropped.)

# Named aliases; previously assigned but never used by the transformer below.
numerical_cols = num_features
categorical_cols = cat_features

# --- Preprocessing pipeline ---
# Numerical columns: mean-impute (LotFrontage, MasVnrArea and GarageYrBlt
# contain NaNs) then standardize. The label-encoded categoricals pass through
# unchanged — tree models need no scaling.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', 'passthrough', categorical_cols)
    ]
)

# Fit on the training split only; reuse the same statistics on the OOB split
# to avoid information leakage.
X_train_scaled = preprocessor.fit_transform(X_train)
X_oob_scaled = preprocessor.transform(X_oob)
Modeling¶
In [23]:
USE_HYPERPARAMETER_OPTIMIZATION = False
With Hyperparameter Optimization¶
In [24]:
# Number of random draws from the hyperparameter grid.
PARAMETER_COMBINATIONS = 50
# Track RMSE per fold.
train_rmse_per_fold = []
val_rmse_per_fold = []
# Track predictions per fold.
train_predictions = []
val_predictions = []
# Track RMSE per epoch per fold.
# NOTE(review): these two lists are re-assigned from evals_result() later;
# the empty initialisations here are never read.
train_rmse_per_epoch = []
val_rmse_per_epoch = []
param_grid = {
    'n_estimators': [300, 500, 1_000],  # Epochs (boosting rounds)
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'reg_lambda': [0.01, 0.1, 1, 10, 20],  # L2 regularization
    'reg_alpha': [0, 0.01, 0.1, 0.5],      # L1 regularization
    'gamma': [0.1, 0.3, 0.5],              # Regularize tree splits
}
# Base estimator: early stopping on the eval_set passed to fit(); all cores.
xgb_model = XGBRegressor(early_stopping_rounds=10,
                         n_jobs=-1)
# 50 random combinations, 3-fold CV, RMSE as the selection metric.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_iter=PARAMETER_COMBINATIONS,
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
In [25]:
%%time
# NOTE(review): the eval_set driving early stopping is the OOB split, so OOB
# data influences model selection here — later OOB metrics are mildly optimistic.
random_search.fit(X_train_scaled, y_train,
                  eval_set=[(X_oob_scaled, y_oob)],
                  verbose=0)
Fitting 3 folds for each of 50 candidates, totalling 150 fits CPU times: user 1.36 s, sys: 252 ms, total: 1.61 s Wall time: 53.3 s
Out[25]:
RandomizedSearchCV(cv=3,
estimator=XGBRegressor(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=10,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=No...
random_state=None, ...),
n_iter=50, n_jobs=-1,
param_distributions={'colsample_bytree': [0.8, 1],
'gamma': [0.1, 0.3, 0.5],
'learning_rate': [0.001, 0.005, 0.01,
0.05, 0.1],
'max_depth': [3, 5, 7],
'n_estimators': [300, 500, 1000],
'reg_alpha': [0, 0.01, 0.1, 0.5],
'reg_lambda': [0.01, 0.1, 1, 10, 20],
'subsample': [0.8, 1]},
random_state=42, scoring='neg_root_mean_squared_error',
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=3,
estimator=XGBRegressor(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=10,
enable_categorical=False,
eval_metric=None, feature_types=None,
gamma=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=No...
random_state=None, ...),
n_iter=50, n_jobs=-1,
param_distributions={'colsample_bytree': [0.8, 1],
'gamma': [0.1, 0.3, 0.5],
'learning_rate': [0.001, 0.005, 0.01,
0.05, 0.1],
'max_depth': [3, 5, 7],
'n_estimators': [300, 500, 1000],
'reg_alpha': [0, 0.01, 0.1, 0.5],
'reg_lambda': [0.01, 0.1, 1, 10, 20],
'subsample': [0.8, 1]},
random_state=42, scoring='neg_root_mean_squared_error',
verbose=1)XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=10,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=10,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=-1,
num_parallel_tree=None, random_state=None, ...)In [26]:
random_search.best_params_
Out[26]:
{'subsample': 0.8,
'reg_lambda': 1,
'reg_alpha': 0.1,
'n_estimators': 1000,
'max_depth': 3,
'learning_rate': 0.05,
'gamma': 0.1,
'colsample_bytree': 0.8}
In [27]:
best_model = random_search.best_estimator_
# XGBoost deprecates `eval_metric` as a fit() argument (the deprecation
# warning was visible in the original output); set it on the estimator
# instead — identical behaviour, no warning.
best_model.set_params(eval_metric='rmse')
# Refit on the training split, recording RMSE per boosting round on both sets.
best_model.fit(X_train_scaled, y_train,
               eval_set=[(X_train_scaled, y_train), (X_oob_scaled, y_oob)],
               verbose=False)
evals_result = best_model.evals_result()
train_rmse_per_epoch = evals_result['validation_0']['rmse']  # training-set RMSE per round (log1p scale)
oob_rmse_per_epoch = evals_result['validation_1']['rmse']    # OOB-set RMSE per round (log1p scale)
train_predictions = best_model.predict(X_train_scaled)  # Training set predictions
oob_predictions = best_model.predict(X_oob_scaled)      # OOB set predictions
# Reverse the log(1 + y) transformation back to dollars.
train_predictions = np.expm1(train_predictions)
oob_predictions = np.expm1(oob_predictions)
`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.
In [28]:
# Dollar-scale RMSE: undo log1p on the targets before comparing to the
# already back-transformed predictions.
y_train_true = np.expm1(y_train)
y_oob_true = np.expm1(y_oob)
train_rmse = np.sqrt(mean_squared_error(y_train_true, train_predictions))
oob_rmse = np.sqrt(mean_squared_error(y_oob_true, oob_predictions))
print(f"Final Train RMSE: {train_rmse:,.0f}")
print(f"Final OOB RMSE : {oob_rmse:,.0f}")
Final Train RMSE: 19,103 Final OOB RMSE : 28,313
In [29]:
# -------------------------------------- #
# Plot RMSE per epoch for the best model #
# -------------------------------------- #
plt.figure(figsize=(6, 4))
train_epochs = range(1, len(train_rmse_per_epoch) + 1)
oob_epochs = range(1, len(oob_rmse_per_epoch) + 1)
plt.plot(train_epochs, train_rmse_per_epoch,
         label='Train RMSE', linestyle='-', lw=2, color='blue')
plt.plot(oob_epochs, oob_rmse_per_epoch,
         label='OOB RMSE', linestyle='-.', lw=2, color='orange')
plt.title('Train and OOB RMSE per Epoch (Best Parameter Combination)')
plt.xlabel('Boosting Round / Epoch')
plt.ylabel('RMSE')
plt.legend()
plt.tight_layout()
plt.show()
In [30]:
%%time
# --------------------------------------------- #
# Plot RMSE for each hyperparameter combination #
# --------------------------------------------- #
train_rmse_per_iter = []
oob_rmse_per_iter = []
for i in range(len(random_search.cv_results_['params'])):
_model = XGBRegressor(**random_search.cv_results_['params'][i])
_model.fit(X_train, y_train,
eval_set=[(X_oob, y_oob)],
verbose=0)
y_train_pred = _model.predict(X_train)
y_oob_pred = _model.predict(X_oob)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
oob_rmse = np.sqrt(mean_squared_error(y_oob, y_oob_pred))
train_rmse_per_iter.append(train_rmse)
oob_rmse_per_iter.append(oob_rmse)
plt.figure(figsize=(6, 4))
plt.plot(range(1, len(train_rmse_per_iter) + 1), train_rmse_per_iter, label='Train RMSE', marker='o', markersize=2, color='blue')
plt.plot(range(1, len(oob_rmse_per_iter) + 1), oob_rmse_per_iter, label='OOB RMSE', marker='x', markersize=2, color='orange')
plt.title('Train and OOB RMSE per Parameter Combination')
plt.xlabel('Parameter Combination')
plt.ylabel('RMSE')
plt.legend()
plt.tight_layout()
plt.show()
CPU times: user 1min 59s, sys: 1.42 s, total: 2min Wall time: 1min 1s
Without Hyperparameter Optimization¶
In [31]:
# Fixed, hand-tuned model used when hyperparameter search is disabled.
xgb_model = XGBRegressor(
    objective='reg:squarederror',  # Required optimization metric for this competition.
    random_state=RANDOM_STATE,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,                 # L1 regularization
    reg_lambda=2.0,                # L2 regularization
    early_stopping_rounds=50       # stop when eval_set RMSE stalls for 50 rounds
)
In [32]:
# 5-fold CV on the training split; early stopping uses each fold's own
# validation part as eval_set.
kf = KFold(n_splits=5,
           shuffle=True,
           random_state=RANDOM_STATE)
train_rmse_per_fold, val_rmse_per_fold = [], []  # RMSE per fold
train_predictions, val_predictions = [], []      # dollar-scale predictions per fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    X_train_cv, X_val_cv = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]
    xgb_model.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=False)
    # Predict and map back from log1p to dollars.
    y_train_pred = np.expm1(xgb_model.predict(X_train_cv))
    y_val_pred = np.expm1(xgb_model.predict(X_val_cv))
    train_predictions.append(y_train_pred)
    val_predictions.append(y_val_pred)
    # Dollar-scale RMSE against the back-transformed targets.
    train_rmse = np.sqrt(mean_squared_error(np.expm1(y_train_cv), y_train_pred))
    val_rmse = np.sqrt(mean_squared_error(np.expm1(y_val_cv), y_val_pred))
    train_rmse_per_fold.append(train_rmse)
    val_rmse_per_fold.append(val_rmse)
    print(f"Fold {fold+1} -> Train RMSE: {train_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
Fold 1 -> Train RMSE: 3029.7441, Validation RMSE: 30521.5768 Fold 2 -> Train RMSE: 4552.0018, Validation RMSE: 32904.1754 Fold 3 -> Train RMSE: 3040.2980, Validation RMSE: 30577.1073 Fold 4 -> Train RMSE: 6313.2936, Validation RMSE: 23972.4075 Fold 5 -> Train RMSE: 6781.1205, Validation RMSE: 24642.8847
In [33]:
# Mean ± std of the fold RMSEs, then a dollar-scale RMSE on the held-out OOB split.
fold_train = np.asarray(train_rmse_per_fold)
fold_val = np.asarray(val_rmse_per_fold)
train_mu, train_sd = fold_train.mean(), fold_train.std()
val_mu, val_sd = fold_val.mean(), fold_val.std()
print(f"Training RMSE: {train_mu:>7,.0f} +- {train_sd:>6,.0f}")
print(f"Validation RMSE: {val_mu:>7,.0f} +- {val_sd:>6,.0f}")
y_oob_pred = xgb_model.predict(X_oob_scaled)
oob_rmse = np.sqrt(mean_squared_error(np.expm1(y_oob),
                                      np.expm1(y_oob_pred)))
print(f"Final OOB RMSE : {oob_rmse:7,.0f}")
Training RMSE: 4,743 +- 1,581 Validation RMSE: 28,524 +- 3,554 Final OOB RMSE : 28,011
In [34]:
TRAIN_ON_WHOLE_TRAIN = True
# NOTE(review): this refits xgb_model on train + OOB combined. Any later
# evaluation on X_oob_scaled (e.g. the "OOB RMSE" in the Validation section)
# is then computed on data the model was trained on and overstates
# generalisation. The eval_set below is likewise part of the training data.
if TRAIN_ON_WHOLE_TRAIN:
    X_train_total = np.concatenate([X_train_scaled, X_oob_scaled], axis=0)
    y_train_total = np.concatenate([y_train, y_oob], axis=0)
    xgb_model = xgb_model.fit(X_train_total,
                              y_train_total,
                              eval_set=[(X_oob_scaled, y_oob)],
                              verbose=False)
    y_train_total_pred = xgb_model.predict(X_train_total)
    train_total_rmse = np.sqrt(mean_squared_error(np.expm1(y_train_total),
                                                  np.expm1(y_train_total_pred)))
    print(f"Total Train RMSE : {train_total_rmse:7,.0f}")
Total Train RMSE : 2,847
Stacking¶
In [35]:
# from sklearn.ensemble import StackingRegressor
# from lightgbm import LGBMRegressor
# xgb = XGBRegressor(
# objective='reg:squarederror', # Required optimization metric for this competition.
# random_state=RANDOM_STATE,
# n_estimators=500,
# learning_rate=0.05,
# max_depth=6,
# subsample=0.8,
# colsample_bytree=0.8,
# reg_alpha=0.1,
# reg_lambda=2.0,
# verbosity=0,
# n_jobs=-1,
# )
# estimators = [
# ('xgb', xgb),
# ('lgbm', LGBMRegressor(verbosity=-1, n_jobs=-1))
# ]
# stacking_regressor = StackingRegressor(estimators=estimators,
# final_estimator=XGBRegressor(n_jobs=-1),
# # cv=3,
# n_jobs=-1,
# passthrough=False,
# verbose=0,
# )
In [36]:
# %%time
# kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# # Track RMSE per fold
# train_rmse_per_fold = []
# val_rmse_per_fold = []
# # Perform Cross-Validation
# for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
# X_train_cv, X_val_cv = X_train_scaled[train_idx], X_train_scaled[val_idx]
# y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]
# # Fit the stacking model on the current fold's training data
# stacking_regressor.fit(X_train_cv, y_train_cv)
# # Make predictions for the current training and validation sets
# y_train_pred = stacking_regressor.predict(X_train_cv)
# y_val_pred = stacking_regressor.predict(X_val_cv)
# # Calculate RMSE for training and validation sets
# train_rmse = np.sqrt(mean_squared_error(y_train_cv, y_train_pred))
# val_rmse = np.sqrt(mean_squared_error(y_val_cv, y_val_pred))
# # Track RMSE
# train_rmse_per_fold.append(train_rmse)
# val_rmse_per_fold.append(oob_rmse)
# print(f"Fold {fold+1} -> Train RMSE: {train_rmse:,.0f} Validation RMSE: {val_rmse:,.0f}")
# # Calculate average RMSE across all folds
# avg_train_rmse = np.mean(train_rmse_per_fold)
# avg_val_rmse = np.mean(val_rmse_per_fold)
# print()
# print(f"Average Training RMSE: {avg_train_rmse:,.0f}")
# print(f"Average Validation RMSE: {avg_val_rmse:,.0f}")
# # Fit final model on full training data
# stacking_regressor.fit(X_train_scaled, y_train)
# # Make final predictions on OOB set
# oob_predictions = stacking_regressor.predict(X_oob_scaled)
# final_oob_rmse = np.sqrt(mean_squared_error(y_oob, oob_predictions))
# print(f"Final OOB RMSE : {final_oob_rmse:,.0f}")
Validation¶
In [37]:
# Select the active model and assemble its train/validation predictions
# (dollar scale) for the comparison plots below.
if USE_HYPERPARAMETER_OPTIMIZATION:
    model = best_model
    # train_predictions already holds the refit model's dollar-scale output.
    val_predictions = oob_predictions
else:
    model = xgb_model
    # Flatten the per-fold prediction lists from the CV loop.
    train_predictions = np.concatenate(train_predictions)
    val_predictions = np.concatenate(val_predictions)
In [38]:
# Evaluate on oob set.
y_oob_pred = model.predict(X_oob_scaled)
oob_rmse = np.sqrt(mean_squared_error(np.expm1(y_oob), np.expm1(y_oob_pred)))
print(f"OOB RMSE: {oob_rmse:.4f}")
OOB RMSE: 3006.8506
In [39]:
# Replace any infinite predictions with NaN so seaborn's histogram can bin them.
def _inf_to_nan(values):
    """Return `values` with +/-inf entries replaced by NaN (no mutation)."""
    return np.where(np.isinf(values), np.nan, values)

train_predictions_clean = _inf_to_nan(train_predictions)
val_predictions_clean = _inf_to_nan(val_predictions)
In [40]:
# --------------------------------------------------------- #
# Confidence Intervals for Train and Validation Predictions #
# --------------------------------------------------------- #
plt.figure(figsize=(10, 4))
sns.histplot(train_predictions_clean, kde=False,
color="#4C72B0", linewidth=.75, edgecolor='white', alpha=0.75, fill=True,
label=f"Train Predictions (Mean RMSE: {train_mu:,.0f})")
sns.histplot(val_predictions_clean, kde=False,
color="#55A868", linewidth=.75, edgecolor='white', alpha=1.0, fill=True,
label=f"Validation Predictions (Mean RMSE: {val_mu:,.0f})")
plt.axvline(np.mean(np.expm1(y_oob_pred)), color="red", linestyle="--", linewidth=1.75, label=f"OOB Prediction Mean RMSE: {oob_rmse:,.0f}")
plt.title("Comparing Train, Validation and OOB Prediction", fontsize=12)
plt.xlabel("Predicted SalePrice")
plt.ylabel("Density")
plt.legend()
ax = plt.gca()
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))
plt.show()
# Display Train/Validation RMSE per fold.
for i, (train_rmse, val_rmse) in enumerate(zip(train_rmse_per_fold, val_rmse_per_fold), 1):
print(f"Fold {i} -> Train RMSE: {train_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
Fold 1 -> Train RMSE: 3029.7441, Validation RMSE: 30521.5768 Fold 2 -> Train RMSE: 4552.0018, Validation RMSE: 32904.1754 Fold 3 -> Train RMSE: 3040.2980, Validation RMSE: 30577.1073 Fold 4 -> Train RMSE: 6313.2936, Validation RMSE: 23972.4075 Fold 5 -> Train RMSE: 6781.1205, Validation RMSE: 24642.8847
In [41]:
# Predictions from the selected model (log1p scale) for the diagnostic plots.
y_pred_train = model.predict(X_train_scaled)
y_pred_oob = model.predict(X_oob_scaled)
In [42]:
def plot_with_histograms(y_train, y_pred_train, y_oob, y_pred_oob):
    """Plots train and oob predictions with histograms on the right and top.

    Args:
        y_train, y_pred_train: true / predicted values for the training split.
        y_oob, y_pred_oob: true / predicted values for the OOB split.
    Callers pass dollar-scale values (already back-transformed with expm1).
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # --- Train Predictions vs True Values ---
    ax = axes[0]
    ax.scatter(y_train, y_pred_train, color="#4C72B0", alpha=0.6, s=40, edgecolor='white')
    # Identity line: perfect predictions would sit on it.
    ax.plot([min(y_train), max(y_train)], [min(y_train), max(y_train)], '--', color='red')
    ax.set_xlabel("True SalePrice")
    ax.set_ylabel("Predicted SalePrice")
    # --- OOB Predictions vs True Values ---
    ax = axes[1]
    ax.scatter(y_oob, y_pred_oob, color="#55A868", alpha=0.6, s=40, edgecolor='white')
    ax.plot([min(y_oob), max(y_oob)], [min(y_oob), max(y_oob)], '--', color='red')
    ax.set_xlabel("True SalePrice")
    ax.set_ylabel("Predicted SalePrice")
    # Format both scatter panels in thousands of dollars.
    for ax in axes:
        ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))
    # Attach marginal histograms: predictions on top, true values on the right.
    for i, (ax, (y_true, y_pred)) in enumerate(zip(axes, [(y_train, y_pred_train), (y_oob, y_pred_oob)])):
        divider = make_axes_locatable(ax)
        ax_histx = divider.append_axes("top", 0.8, pad=0.1, sharex=ax)
        ax_histx.hist(y_pred, bins=30, color='#FF6347', alpha=0.6, edgecolor='white')
        ax_histx.set_ylabel('Count')
        ax_histx.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))
        ax_histy = divider.append_axes("right", 0.8, pad=0.1, sharey=ax)
        ax_histy.hist(y_true, bins=30, color='#9370DB', alpha=0.6, orientation='horizontal', edgecolor='white')
        ax_histy.set_xlabel('Count')
        ax_histy.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))
        if i == 0:
            ax_histx.set_title('Train Predictions vs True Values', fontsize=12)
        if i == 1:
            ax_histx.set_title('OOB Predictions vs True Values', fontsize=12)
        # Hide duplicated tick labels on the shared-axis histograms.
        plt.setp(ax_histx.get_xticklabels(), visible=False)
        plt.setp(ax_histy.get_yticklabels(), visible=False)
plot_with_histograms(np.expm1(y_train),
                     np.expm1(y_pred_train),
                     np.expm1(y_oob),
                     np.expm1(y_pred_oob))
Let's take a look at the spread of errors.
In [43]:
def plot_residuals(y_true_train, y_pred_train, y_true_oob, y_pred_oob):
    """Plot residuals (true - predicted) for the train and OOB sets side by side.

    Points above the red zero line are under-predictions, points below are
    over-predictions. Axes are formatted in thousands of dollars.
    """
    panels = [
        ("Train Residuals", y_true_train, y_pred_train, "#4C72B0"),
        ("OOB Residuals", y_true_oob, y_pred_oob, "#55A868"),
    ]
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for ax, (title, y_true, y_pred, color) in zip(axes, panels):
        ax.scatter(y_true, y_true - y_pred, color=color, s=40, edgecolor='white', alpha=0.6)
        ax.axhline(0, linestyle='--', color='red')  # zero-error reference line
        ax.set_title(title, fontsize=12)
        ax.set_xlabel("True SalePrice")
        ax.set_ylabel("Residuals")
        # Format both axes as '<n>k' (thousands of dollars).
        for axis in (ax.xaxis, ax.yaxis):
            axis.set_major_formatter(ticker.FuncFormatter(lambda v, pos: f'{int(v/1_000)}k'))
    plt.tight_layout()
    plt.show()
# Back-transform all four log1p-space series to dollars, then plot residuals.
series_usd = [np.expm1(s) for s in (y_train, y_pred_train, y_oob, y_pred_oob)]
plot_residuals(*series_usd)
Feature Importances¶
In [44]:
# Calculate SHAP values for the model on the scaled OOB features.
# shap.Explainer auto-selects the appropriate algorithm for `model`;
# `shap_values` is an (n_samples, n_features) array of per-feature contributions.
explainer = shap.Explainer(model)
shap_values = explainer.shap_values(X_oob_scaled)
In [45]:
# Sanity checks: exactly one SHAP value per (sample, feature) pair.
assert shap_values.shape[0] == X_oob.shape[0], "ERROR: Mismatch in number of samples!"
assert shap_values.shape[1] == X_oob.shape[1], "ERROR: Mismatch in number of features!"
# --- SHAP Summary Plot on Test Set Sample ---
# Beeswarm: one dot per sample per feature, features ranked by mean |SHAP|.
shap.summary_plot(shap_values, X_oob_scaled, feature_names=features, plot_size=(15, 6))
In [46]:
shap.summary_plot(shap_values, X_oob_scaled, plot_type='bar', feature_names=features, plot_size=(6, 6))
In [47]:
# Interactive force plot over all OOB samples (needs shap.initjs(); the
# "Javascript library not loaded" warning below is expected in a saved notebook).
shap.force_plot(base_value=explainer.expected_value,
                shap_values=shap_values,
                features=X_oob_scaled,
                feature_names=features)
Out[47]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [48]:
# Force plot for a single OOB sample (row 0): how each feature pushes the
# prediction away from the expected value.
shap.plots.force(base_value=explainer.expected_value,
                 shap_values=shap_values[0],
                 feature_names=features)
Out[48]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [49]:
# Force plot for OOB sample at row 1.
shap.plots.force(base_value=explainer.expected_value,
                 shap_values=shap_values[1],
                 feature_names=features)
Out[49]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [50]:
# Force plot for OOB sample at row 2.
shap.plots.force(base_value=explainer.expected_value,
                 shap_values=shap_values[2],
                 feature_names=features)
Out[50]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [51]:
explanation = explainer(X_oob)
In [52]:
# Dependence plot: SHAP value of OverallQual vs its value, colored by the
# encoded kitchen quality. x_jitter spreads the discrete quality levels apart.
shap.plots.scatter(explanation[:, 'OverallQual'], color=explanation[:, 'le_KitchenQual'],
                   dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
In [53]:
# Same idea, but selecting the top feature programmatically: argsort of mean
# |SHAP| is ascending, so [-1] is the most important feature. Passing the whole
# `explanation` as color lets shap auto-pick the strongest interacting feature.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-1]], color=explanation,
                   title='1st Most Important Feature',
                   dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
In [54]:
# Top feature again, but explicitly colored by kitchen quality instead of the
# auto-selected interaction feature.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-1]], color=explanation[:, 'le_KitchenQual'],
                   title='1st Most Important Feature',
                   dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
In [55]:
# 2nd most important feature ([-2] in the ascending importance order); color is
# a single fixed RGBA sampled from viridis, not a per-point mapping.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-2]], color=plt.get_cmap('viridis')(0.0),
                   title='2nd Most Important Feature',
                   dot_size=30, alpha=0.7)
In [56]:
# 3rd most important feature ([-3]), same fixed-color style as above.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-3]], color=plt.get_cmap('viridis')(0.0),
                   title='3rd Most Important Feature',
                   dot_size=30, alpha=0.7)
In [57]:
# Custom matplotlib rendering of a SHAP dependence plot: the 3rd most important
# feature on the x-axis, its SHAP values on the y-axis, colored by the value of
# the most important feature.
# Hoist the feature ranking so it is computed once instead of four times.
importance_order = explanation.abs.mean(0).argsort  # ascending mean |SHAP|
idx_first = importance_order[-1]   # most important feature
idx_third = importance_order[-3]   # 3rd most important feature
fig, ax = plt.subplots(tight_layout=True, figsize=(10, 5))
scatter = ax.scatter(
    explanation[:, idx_third].data,
    explanation[:, idx_third].values,
    c=explanation[:, idx_first].data,
    marker="^",
    cmap=plt.get_cmap("rainbow"),
    rasterized=True,
    zorder=5,
)
cbar = plt.colorbar(scatter, aspect=50, format="%2.1f")
cbar.set_label("1st", fontsize=14)
cbar.outline.set_visible(False)
ax.set_title("Customization", fontsize=18)
# Fixed: the plotted data is argsort[-3] — the 3rd most important feature
# (labeled "3rd Most Important Feature" in the cell above) — not the 2nd.
ax.set_xlabel("3rd", fontsize=16)
ax.set_ylabel("SHAP value for\n3rd", fontsize=16)
ax.tick_params(labelsize=14)
ax.grid(linestyle="--", color="gray", linewidth=0.5, zorder=0, alpha=0.5)
plt.show()
In [58]:
# Find the OOB rows with the smallest and largest absolute error in dollars.
# Fixed: the OOB predictions are named `y_pred_oob` everywhere else in this
# notebook (see the plotting calls above); `y_oob_pred` is not defined.
differences = np.abs(np.expm1(y_pred_oob) - np.expm1(y_oob))
closest_index = np.argmin(differences)
farthest_index = np.argmax(differences)
# Per-row SHAP values for the two extreme cases.
# NOTE(review): computed on unscaled rows while `shap_values` used X_oob_scaled,
# and the decision plots below index `shap_values` directly — confirm these two
# per-row recomputations are still needed.
shap_values_closest = explainer.shap_values(X_oob.iloc[[closest_index]])
shap_values_farthest = explainer.shap_values(X_oob.iloc[[farthest_index]])
In [59]:
# Decision plot for the best OOB prediction: the path from the explainer's
# expected value to the model output, ordered by feature importance.
plt.figure(figsize=(6,6))
shap.decision_plot(explainer.expected_value,
                   shap_values[closest_index],
                   X_oob.iloc[closest_index].values,
                   feature_names=features,
                   highlight=0,
                   # Fixed typo in title: 'Closted' -> 'Closest'.
                   title='Decision Plot of Prediction Closest to True SalePrice',
                   auto_size_plot=False)
plt.show()
In [60]:
# Decision plot for the worst OOB prediction (largest absolute dollar error);
# highlight=0 emphasizes this single row's path.
plt.figure(figsize=(6,6))
shap.decision_plot(explainer.expected_value,
                   shap_values[farthest_index],
                   X_oob.iloc[farthest_index].values,
                   feature_names=features,
                   highlight=0,
                   title='Decision Plot of Prediction Farthest from True SalePrice',
                   auto_size_plot=False)
plt.show()
Submission¶
In [61]:
# Load the Kaggle test set (same columns as train, minus SalePrice).
houses_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
n_rows, n_cols = houses_test.shape
print(f"Rows: {n_rows:,}")
print(f"Cols: {n_cols:,}")
houses_test.head(3)
Rows: 1,459 Cols: 80
Out[61]:
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | ... | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
3 rows × 80 columns
In [62]:
houses_test = handle_missing_values(houses_test, object_cols, strategy='fill_missing')
In [63]:
# Encode test-set categoricals with the LabelEncoders fitted on training data;
# categories unseen during training map to -1.
# Performance fix: LabelEncoder codes are just positions in the sorted
# `classes_` array, so a single dict lookup via Series.map replaces a per-row
# `le.transform([x])[0]` call (which paid full sklearn overhead per value).
for obj_col in object_cols:
    le = le_result.get('label_encoder').get(obj_col)
    class_to_code = {cls: code for code, cls in enumerate(le.classes_)}
    houses_test[f'le_{obj_col}'] = houses_test[obj_col].map(class_to_code).fillna(-1).astype(int)
houses_test[le_result.get('encoded_features')].head(3)
Out[63]:
| le_MSZoning | le_Street | le_Alley | le_LotShape | le_LandContour | le_Utilities | le_LotConfig | le_LandSlope | le_Neighborhood | le_Condition1 | ... | le_GarageType | le_GarageFinish | le_GarageQual | le_GarageCond | le_PavedDrive | le_PoolQC | le_Fence | le_MiscFeature | le_SaleType | le_SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1 | 1 | 3 | 3 | 0 | 4 | 0 | 12 | 1 | ... | 1 | 3 | 5 | 5 | 2 | 3 | 3 | 1 | 8 | 4 |
| 1 | 3 | 1 | 1 | 0 | 3 | 0 | 0 | 0 | 12 | 2 | ... | 1 | 3 | 5 | 5 | 2 | 3 | 2 | 0 | 8 | 4 |
| 2 | 3 | 1 | 1 | 0 | 3 | 0 | 4 | 0 | 8 | 2 | ... | 1 | 0 | 5 | 5 | 2 | 3 | 3 | 1 | 8 | 4 |
3 rows × 43 columns
In [64]:
X_sub_scaled = preprocessor.transform(houses_test[features])
In [65]:
# Predict in log1p space and invert back to dollar amounts for the submission.
y_sub_pred = np.expm1(model.predict(X_sub_scaled))
y_sub_pred[:10]
Out[65]:
array([122455.41, 157231.78, 178254.45, 190086. , 187468.19, 172464.73,
173588.95, 169919.52, 175365.38, 123989.9 ], dtype=float32)
In [66]:
# Sanity check: submission predictions should roughly follow the distribution
# of the (cleaned) train predictions.
plt.figure(figsize=(8, 4))
# Fixed idiom: the labels were f-strings with no placeholders.
plt.hist(train_predictions_clean, bins=50, edgecolor='white', linewidth=1, color='#4C72B0', alpha=0.5, label="Train Predictions")
plt.hist(y_sub_pred, bins=50, edgecolor='white', linewidth=1, color='#9370DB', alpha=1.0, label="Submission Predictions")
plt.xlabel(TARGET)
plt.title("Distribution of Target SalePrice (Submission file)", fontsize=12)
plt.legend()
ax = plt.gca()
# Format the price axis in thousands of dollars.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x/1_000)}k'))
plt.show()
In [67]:
# Build and write the Kaggle submission: one row per test Id with the
# predicted SalePrice.
submission_df = pd.DataFrame({
    ID: houses_test[ID],
    TARGET: y_sub_pred
})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
# Fixed idiom: the message was an f-string with no placeholders.
print("Submission file 'submission.csv' created successfully!")
submission_df
Submission file 'submission.csv' created successfully!
Out[67]:
| Id | SalePrice | |
|---|---|---|
| 0 | 1461 | 122455.406250 |
| 1 | 1462 | 157231.781250 |
| 2 | 1463 | 178254.453125 |
| 3 | 1464 | 190086.000000 |
| 4 | 1465 | 187468.187500 |
| ... | ... | ... |
| 1454 | 2915 | 81067.679688 |
| 1455 | 2916 | 78969.656250 |
| 1456 | 2917 | 160173.062500 |
| 1457 | 2918 | 118705.812500 |
| 1458 | 2919 | 222827.031250 |
1459 rows × 2 columns
💚 Thank you for reading 💚
If you have any questions or feedback, feel free to leave a comment 🤔
This notebook is still in progress.
Please UPVOTE if you enjoyed this notebook 🙏